import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# !pip install pandas_profiling;
# !pip install emot;
# !pip install ipywidgets;
# !pip install -U kaleido
df_path = os.path.join('..', 'data', 'train.csv')
df = pd.read_csv(df_path)
print(f"SHAPE: {df.shape}")
df.head(3)
SHAPE: (7613, 5)
| id | keyword | location | text | target | |
|---|---|---|---|---|---|
| 0 | 1 | NaN | NaN | Our Deeds are the Reason of this #earthquake M... | 1 |
| 1 | 4 | NaN | NaN | Forest fire near La Ronge Sask. Canada | 1 |
| 2 | 5 | NaN | NaN | All residents asked to 'shelter in place' are ... | 1 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7613 entries, 0 to 7612 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 7613 non-null int64 1 keyword 7552 non-null object 2 location 5080 non-null object 3 text 7613 non-null object 4 target 7613 non-null int64 dtypes: int64(2), object(3) memory usage: 297.5+ KB
# Поиск дубликатов
print(f"Dplicates: {df.duplicated().any()}")
Dplicates: False
print(df['location'].unique())
[nan 'Birmingham' 'Est. September 2012 - Bristol' ... 'Vancouver, Canada' 'London ' 'Lincoln']
# Поиск аномалий (Сообщений с одинаковым текстом но разными метками)
df_mislabeled = df.groupby(['text']).nunique().sort_values(by='target', ascending=False)
df_mislabeled = df_mislabeled[df_mislabeled['target'] > 1]['target']
df_mislabeled
text like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit 2 Hellfire! We donÛªt even want to think about it or mention it so letÛªs not do anything that leads to it #islam! 2 The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.' 2 In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism! 2 To fight bioterrorism sir. 2 Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE 2 #foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption 2 #Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect 2 He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam 2 RT NotExplained: The only known image of infamous hijacker D.B. Cooper. http://t.co/JlzK2HdeTG 2 Hellfire is surrounded by desires so be careful and donÛªt let your desires control you! #Afterlife 2 CLEARED:incident with injury:I-495 inner loop Exit 31 - MD 97/Georgia Ave Silver Spring 2 Mmmmmm I'm burning.... I'm burning buildings I'm building.... Oooooohhhh oooh ooh... 2 wowo--=== 12000 Nigerian refugees repatriated from Cameroon 2 .POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4 2 Caution: breathing may be hazardous to your health. 2 I Pledge Allegiance To The P.O.P.E. And The Burning Buildings of Epic City. ?????? 2 that horrible sinking feeling when youÛªve been at home on your phone for a while and you realise its been on 3G this whole time 2 Name: target, dtype: int64
# Удаление выбросов
df = df[~df['text'].isin(df_mislabeled.index.to_list())]
df.reset_index(inplace=True)
print(f"Shape: {df.shape}")
Shape: (7558, 6)
fig = px.pie(df, names='target', title='Соотношение классов таргета')
fig.show()
fig = px.histogram(data_frame=df.sort_values('target'), x="keyword", color='target')
fig.update_xaxes(categoryorder='min ascending')
# fig.update_xaxes(tickangle = -90)|
fig.show()
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(
ax = ax,
y = df['location'],
order = df['location'].value_counts().iloc[:15].index,
)
ax.set_title('Top 25 location from the tweets')
plt.grid()
plt.show()
# Конвертация эмодзи и текстовых смайликов
import re
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
from pandas_profiling import ProfileReport
# Конвертация эмодзи в слова
def convert_emojis(text):
for emot in UNICODE_EMOJI:
text = text.replace(emot, "_".join(UNICODE_EMOJI[emot].replace(",","").replace(":","").split()))
# text = text.replace(emot, "")
return text
# Converting emoticons to words
def convert_emoticons(text):
for emot in EMOTICONS_EMO:
text = text.replace(emot, EMOTICONS_EMO[emot].replace(" ","_"))
# text = text.replace(emot, "")
return text
import string
def clean_text(text: str) -> str:
text = str(text).lower()
clean_text = text.replace('{html}',"") # removing html files
# clean_text = re.sub('<.*?>', '', clean_text) # Removing punctuation
clean_text = re.sub(fr'[{string.punctuation}]', '', clean_text) # Removing punctuation
clean_text = re.sub(r'http\S+', '',clean_text) # Removing links
clean_text = re.sub('[0-9]+', '', clean_text) # Removing numbers
return clean_text
import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
# from pymystem3 import Mystem
# nltk.download('wordnet')
nltk.download('stopwords')
en_stopwords = set(stopwords.words("english"))
wnl = WordNetLemmatizer()
def lemmatize_text(text: str):
return " ".join([wnl.lemmatize(w) for w in text.split() if w not in en_stopwords])
[nltk_data] Downloading package wordnet to [nltk_data] C:\Users\Alan\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date! [nltk_data] Downloading package stopwords to [nltk_data] C:\Users\Alan\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
df['text'] = df['text'].apply(convert_emoticons)
df['text'] = df['text'].apply(convert_emojis)
df['clean_text'] = df['text'].apply(lemmatize_text)
df['clean_text'] = df['clean_text'].apply(clean_text)
df['keyword'].fillna('UNKNOW', inplace=True)
df['location'].fillna('UNKNOW', inplace=True)
df['text_len'] = df['clean_text'].apply(len)
from collections import Counter
# Removing the frequent words
cnt = Counter()
for text in df["clean_text"].values:
for word in text.split():
cnt[word] += 1
px.bar(x=[w for w, c in cnt.most_common()[:100]], y=[c for w, c in cnt.most_common()[:100]])
cnt.most_common(15)
[('i', 1255),
('the', 676),
('a', 342),
('like', 342),
('fire', 316),
('amp', 300),
('im', 293),
('get', 252),
('new', 224),
('via', 220),
('in', 218),
('one', 200),
('news', 197),
('people', 190),
('video', 170)]
freq = set([w for (w, wc) in cnt.most_common(2)])
# function to remove the frequent words
def freqwords(text):
return " ".join([word for word in str(text).split() if word not in freq])
# Passing the function freqwords
df["clean_text"] = df["clean_text"].apply(freqwords)
df["clean_text"].head()
0 our deeds reason earthquake may allah forgive u 1 forest fire near la ronge sask canada 2 all resident asked shelter place notified offi... 3 people receive wildfires evacuation order cali... 4 just got sent photo ruby alaska smoke wildfire... Name: clean_text, dtype: object
df.head()
| index | id | keyword | location | text | target | clean_text | text_len | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | UNKNOW | UNKNOW | Our Deeds are the Reason of this #earthquake M... | 1 | our deeds reason earthquake may allah forgive u | 47 |
| 1 | 1 | 4 | UNKNOW | UNKNOW | Forest fire near La Ronge Sask. Canada | 1 | forest fire near la ronge sask canada | 37 |
| 2 | 2 | 5 | UNKNOW | UNKNOW | All residents asked to 'shelter in place' are ... | 1 | all resident asked shelter place notified offi... | 93 |
| 3 | 3 | 6 | UNKNOW | UNKNOW | 13,000 people receive #wildfires evacuation or... | 1 | people receive wildfires evacuation order cali... | 53 |
| 4 | 4 | 7 | UNKNOW | UNKNOW | Just got sent this photo from Ruby #Alaska as ... | 1 | just got sent photo ruby alaska smoke wildfire... | 60 |
profile = ProfileReport(df, html={'style':{'full_width':True}})
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score, classification_report
# prepared_df = df[['clean_text', 'text_len', 'target']]
prepared_df = df[['keyword', 'location', 'clean_text', 'text_len', 'target']]
X_train, X_test, y_train, y_test = train_test_split(
prepared_df.drop('target', axis=1),
prepared_df['target'],
test_size=0.3,
random_state=42
)
print(f"SHAPES:\nX_train: {X_train.shape}\ty_train: {y_train.shape}\nX_test: {X_test.shape}\ty_test: {y_test.shape}")
SHAPES: X_train: (5290, 4) y_train: (5290,) X_test: (2268, 4) y_test: (2268,)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
class ModifiedLabelEncoder(LabelEncoder):
def __init__(self):
self.encoders = []
def fit_transform(self, X, *args, **kwargs):
for i, x in enumerate(X):
self.encoders.append(LabelEncoder())
X[i] = self.encoders[i].fit_transform(x)
return X
def transform(self, X, *args, **kwargs):
for i, x in enumerate(X):
X[i] = self.encoders[i].transform(x)
return X
def create_pipeline(model):
# CATEGORICAL
categorical_features = ['keyword', 'location']
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
('label_encoder', OneHotEncoder(handle_unknown='ignore')),
# ('label_encoder', ModifiedLabelEncoder())
])
# NUMERIC
numeric_features = ['text_len']
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='mean')),
('scaler', StandardScaler())
])
# TEXT
text_features = ['clean_text']
text_transformer = Pipeline(steps=[
('vectorizer', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True, max_features=20_000)),
('svd', TruncatedSVD(n_components=1000, random_state=42))
])
# PIPELINE
pipeline = Pipeline(steps=[
('preprocessor', ColumnTransformer(transformers=[
('cat', categorical_transformer, categorical_features),
('num', numeric_transformer, numeric_features),
('text', text_transformer, 'clean_text'),
], remainder='drop', verbose=False)),
('clf', model)
])
return pipeline
%%time
parameters = {
'clf__penalty':['l1', 'l2', 'elasticnet'],
'clf__C':[0.5, 1.0, 1.5, 2.0, 2.5, 3.0],
'clf__class_weight': [None, 'balanced'],
'clf__solver': ['liblinear', 'saga']
}
log_reg = LogisticRegression(random_state=42)
log_reg_pipe = create_pipeline(log_reg)
log_reg_clf = GridSearchCV(log_reg_pipe, parameters, scoring='f1', n_jobs=-1)
log_reg_clf.fit(X_train, y_train.values)
Wall time: 32min 21s
GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('label_encoder',
OneHotEncoder(handle_unknown='ignore'))]),
['keyword',
'location']),
('num',
Pipeline(steps=[('imputer',
SimpleImputer()),
('scaler',
StandardScaler())]),
['text_len']),
('text',
Pi...
TfidfVectorizer(max_features=20000,
ngram_range=(1,
3),
sublinear_tf=True)),
('svd',
TruncatedSVD(n_components=1000,
random_state=42))]),
'clean_text')])),
('clf',
LogisticRegression(random_state=42))]),
n_jobs=-1,
param_grid={'clf__C': [0.5, 1.0, 1.5, 2.0, 2.5, 3.0],
'clf__class_weight': [None, 'balanced'],
'clf__penalty': ['l1', 'l2', 'elasticnet'],
'clf__solver': ['liblinear', 'saga']},
scoring='f1')
log_reg_clf.best_estimator_, log_reg_clf.best_params_, log_reg_clf.best_score_,
(Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('label_encoder',
OneHotEncoder(handle_unknown='ignore'))]),
['keyword', 'location']),
('num',
Pipeline(steps=[('imputer',
SimpleImputer()),
('scaler',
StandardScaler())]),
['text_len']),
('text',
Pipeline(steps=[('vectorizer',
TfidfVectorizer(max_features=20000,
ngram_range=(1,
3),
sublinear_tf=True)),
('svd',
TruncatedSVD(n_components=1000,
random_state=42))]),
'clean_text')])),
('clf',
LogisticRegression(C=1.5, class_weight='balanced',
penalty='l1', random_state=42,
solver='liblinear'))]),
{'clf__C': 1.5,
'clf__class_weight': 'balanced',
'clf__penalty': 'l1',
'clf__solver': 'liblinear'},
0.7598162562575588)
preds = log_reg_clf.predict_proba(X_test)
print(f"roc_auc: {roc_auc_score(y_test, preds[:, 1])}")
preds = log_reg_clf.predict(X_test)
print(f"f1_score: {f1_score(y_test, preds)}")
print(classification_report(y_test, preds))
roc_auc: 0.8630197931860202
f1_score: 0.7621009268795057
precision recall f1-score support
0 0.83 0.81 0.82 1309
1 0.75 0.77 0.76 959
accuracy 0.80 2268
macro avg 0.79 0.79 0.79 2268
weighted avg 0.80 0.80 0.80 2268
from catboost import Pool
from catboost import CatBoostClassifier
train_pool = Pool(
data = X_train,
label = y_train,
cat_features = ['keyword', 'location'],
text_features = ['clean_text']
)
test_pool = Pool(
data = X_test,
label = y_test,
cat_features = ['keyword', 'location'],
text_features = ['clean_text'])
print('Train dataset shape: {}\n'.format(train_pool.shape))
Train dataset shape: (5290, 4)
def fit_model(train_pool, test_pool, **kwargs):
model = CatBoostClassifier(
iterations=1000,
learning_rate=0.05,
eval_metric='AUC',
**kwargs
)
return model.fit(
train_pool,
eval_set=test_pool,
verbose=500,
)
cb_model_clf = fit_model(train_pool, test_pool)
0: test: 0.7744655 best: 0.7744655 (0) total: 235ms remaining: 3m 54s 500: test: 0.8576487 best: 0.8576487 (500) total: 53s remaining: 52.8s 999: test: 0.8637670 best: 0.8639948 (974) total: 1m 45s remaining: 0us bestTest = 0.8639948348 bestIteration = 974 Shrink model to first 975 iterations.
preds = cb_model_clf.predict_proba(X_test)
print(f"roc_auc: {roc_auc_score(y_test, preds[:, 1])}")
preds = cb_model_clf.predict(X_test)
print(f"f1_score: {f1_score(y_test, preds)}")
print(classification_report(y_test, preds))
roc_auc: 0.8639948348284239
f1_score: 0.7557251908396948
precision recall f1-score support
0 0.81 0.86 0.83 1309
1 0.79 0.72 0.76 959
accuracy 0.80 2268
macro avg 0.80 0.79 0.79 2268
weighted avg 0.80 0.80 0.80 2268
%%time
parameters = {
'clf__C':[0.5, 0.7, 1.0, 1.3, 5.0, 10.0],
'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
'clf__class_weight': [None, 'balanced']
}
svc = svm.SVC(random_state=42, probability=True)
svm_pipe = create_pipeline(svc)
svm_clf = GridSearchCV(svm_pipe, parameters, scoring='roc_auc', n_jobs=-2)
svm_clf.fit(X_train, y_train.values)
Wall time: 4h 23min 51s
GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('label_encoder',
OneHotEncoder(handle_unknown='ignore'))]),
['keyword',
'location']),
('num',
Pipeline(steps=[('imputer',
SimpleImputer()),
('scaler',
StandardScaler())]),
['text_len']),
('text',
Pi...
TfidfVectorizer(max_features=20000,
ngram_range=(1,
2),
sublinear_tf=True)),
('svd',
TruncatedSVD(n_components=1000,
random_state=42))]),
'clean_text')])),
('clf',
SVC(probability=True,
random_state=42))]),
n_jobs=-2,
param_grid={'clf__C': [0.5, 0.7, 1.0, 1.3, 5.0, 10.0],
'clf__class_weight': [None, 'balanced'],
'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
scoring='roc_auc')
# svm_clf.best_estimator_, svm_clf.best_params_, svm_clf.best_score_,
preds = svm_clf.predict_proba(X_test)
roc_auc_score(y_test, preds[:, 1])
0.8612943518482377
preds = model.predict(X_test)
print(classification_report(y_test, preds))
precision recall f1-score support
0 0.79 0.90 0.84 1309
1 0.83 0.68 0.75 959
accuracy 0.81 2268
macro avg 0.81 0.79 0.79 2268
weighted avg 0.81 0.81 0.80 2268